import numpy as np
import pandas as pd
import os
import sys
import pickle
from joblib import dump, load
import yaml
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LinearRegression as reg
import pdb
from pprint import pprint
from statistics import mean
import random
random.seed(50)
import math
import argparse
from tqdm import tqdm

# Default input directory: used as the default ``indir`` of
# ``get_feature_importance`` and as the prefix of the ``.joblib`` model path.
defaultaxpayer_id = '/REDACTED/fairness/code/rf/data/'
# Default output directory: used as the default ``outdir`` of
# ``get_feature_importance`` for the importance plot and CSV it writes.
defaultout = '/REDACTED/'

def get_feature_names(configpath='/REDACTED/fairness/code/rf/config/',
                      datapath='/REDACTED/fairness/code/rf/data/',
                      dep_database=False):
    """Return the configured feature columns that exist in the cleaned data.

    The candidate feature list is read from ``data-config.yaml`` in
    ``configpath`` and filtered down to the names that are actual columns of
    the cleaned CSV in ``datapath``. The order of the config list is kept.

    Parameters
    ----------
    configpath : str
        Directory containing ``data-config.yaml``.
    datapath : str
        Directory containing the cleaned CSV file(s).
    dep_database : bool
        If truthy, use ``clean_rf_data_plus_dep_database.csv`` together with
        the ``features_plus_dep_database_str`` config entry; otherwise use
        ``clean_rf_data.csv`` together with ``features_str``.

    Returns
    -------
    pandas.Index
        The feature names present in both the config list and the CSV header.
    """
    # Context manager so the config handle is always closed. ``safe_load`` is
    # used because ``yaml.load`` without an explicit Loader is rejected by
    # PyYAML >= 6 (and only emits a warning on 5.x).
    with open(os.path.join(configpath, 'data-config.yaml'), 'r') as stream:
        config = yaml.safe_load(stream)

    # Plain truthiness instead of ``== False`` / ``== True``: every value now
    # selects exactly one branch, so nothing is left unbound for e.g. ``None``.
    if dep_database:
        csv_name = 'clean_rf_data_plus_dep_database.csv'
        config_key = 'features_plus_dep_database_str'
    else:
        csv_name = 'clean_rf_data.csv'
        config_key = 'features_str'

    # Only the column names are needed, so parse just the header row instead
    # of loading the full data set.
    header = pd.read_csv(os.path.join(datapath, csv_name), nrows=0)

    feature_vars = [x for x in config[config_key] if x in header.columns]
    return header[feature_vars].columns

def get_feature_importance(indir=defaultaxpayer_id,
                           outdir=defaultout,
                           modelname='EITC_NCMP_RF_Class_100_plus_dep_database',
                           feature_names=None):
    """Load a fitted model and export its feature importances.

    Reads ``<indir><modelname>.joblib``, pairs ``model.feature_importances_``
    with ``feature_names``, and writes two files into ``outdir``:

    * ``<modelname>_feat_imp.png`` - horizontal bar chart of the 20 largest
      importances, titled with ``modelname``;
    * ``<modelname>_importances.csv`` - all importances.

    Parameters
    ----------
    indir : str
        Directory holding the ``.joblib`` model file.
    outdir : str
        Directory the PNG and CSV are written to.
    modelname : str
        Base name of the model file (without extension); also used as the
        plot title and as the prefix of both output files.
    feature_names : sequence or None
        Labels used as the index of the returned Series. With ``None`` the
        Series gets a default integer index.

    Returns
    -------
    pandas.Series
        Importances indexed by ``feature_names``.
    """
    # Honour ``indir``: previously the model was always read from the
    # module-level default directory, silently ignoring this argument.
    model = load(os.path.join(indir, modelname + '.joblib'))
    importances = pd.Series(model.feature_importances_, index=feature_names)

    figure = importances.nlargest(20).plot(kind='barh', title=modelname).get_figure()
    figure.savefig(os.path.join(outdir, modelname + '_feat_imp.png'),
                   bbox_inches='tight')
    # ``Series.plot`` draws on the current pyplot axes when a figure already
    # exists, so without clearing, every later call would stack its bars on
    # top of this model's bars in the saved image.
    figure.clf()

    importances.to_csv(os.path.join(outdir, modelname + '_importances.csv'))
    return importances

def _average_fold_importances(model_template, plot_path, feature_names,
                              n_folds=5, top_n=40):
    """Export per-fold importances, then average them and plot the top ones.

    For every fold ``k`` in ``range(n_folds)`` this calls
    ``get_feature_importance`` on ``model_template.format(k)`` (which writes
    the per-fold plot and CSV) and keeps the returned Series, so the results
    do not have to be re-read from disk.

    Parameters
    ----------
    model_template : str
        Model name containing one ``{}`` placeholder for the fold number.
    plot_path : str
        Destination of the bar chart of the averaged importances.
    feature_names : sequence
        Feature labels handed to ``get_feature_importance``.
    n_folds : int
        Number of fold models to average.
    top_n : int
        Number of largest averaged importances to plot and return.

    Returns
    -------
    tuple
        ``(folds, importances, important_features)``: a DataFrame with the
        columns ``feature``, ``'0'`` .. ``str(n_folds - 1)`` and
        ``importance_avg``; the averaged importances as a Series indexed by
        feature; and the alphabetically sorted names of the ``top_n`` features.
    """
    per_fold = [
        get_feature_importance(modelname=model_template.format(fold),
                               feature_names=feature_names)
        for fold in range(n_folds)
    ]

    # One column per fold, named by fold number. Every Series is built from
    # the same ``feature_names``, so the rows line up by feature.
    folds = pd.DataFrame({str(fold): imp for fold, imp in enumerate(per_fold)})
    # Left-to-right sum; a feature missing from any fold yields NaN.
    folds['importance_avg'] = sum(per_fold[1:], per_fold[0]) / n_folds
    folds = folds.rename_axis('feature').reset_index()

    importances = pd.Series(folds['importance_avg'].tolist(),
                            index=folds['feature'])
    top = importances.nlargest(top_n)

    title = 'Feature Importance (Average of {} Folds)'.format(n_folds)
    figure = top.plot(kind='barh', title=title).get_figure()
    figure.savefig(plot_path, bbox_inches='tight')
    # ``Series.plot`` reuses the current axes, so clear the figure to give
    # whatever is plotted next a blank canvas.
    figure.clf()

    important_features = sorted(top.index.tolist())
    return folds, importances, important_features


#feature_names = get_feature_names(dep_database=False)
feature_names_dep_database = get_feature_names(dep_database=True)

## Overclaiming: refundable credit prediction models
folds, importances, important_features = _average_fold_importances(
    'EITC_NCMP_RF_Reg_plus_dep_database_train_set_{}_outcome_ref_cred_amt_dif_pv',
    '/REDACTED/avg_feat_imp_test.png',
    feature_names_dep_database,
)

## Total Underreporting (above is overclaiming)
folds, importances, important_features = _average_fold_importances(
    'EITC_NCMP_RF_Reg_plus_dep_database_train_set_{}',
    '/REDACTED/underreportaxpayer_idg_avg_feat_imp.png',
    feature_names_dep_database,
)

